@article {2305, title = {Comparison of two Bayesian methods to detect mode effects between paper-based and computerized adaptive assessments: a preliminary Monte Carlo study.}, journal = {BMC Med Res Methodol}, volume = {12}, year = {2012}, month = {2012}, pages = {124}, abstract = {

BACKGROUND: Computerized adaptive testing (CAT) is being applied to health outcome measures developed as paper-and-pencil (P\&P) instruments. Differences in how respondents answer items administered by CAT vs. P\&P can increase error in CAT-estimated measures if not identified and corrected.

METHOD: Two methods for detecting item-level mode effects are proposed using Bayesian estimation of posterior distributions of item parameters: (1) a modified robust Z (RZ) test, and (2) 95\% credible intervals (CrI) for the CAT-P\&P difference in item difficulty. A simulation study was conducted under the following conditions: (1) data-generating model (one- vs. two-parameter IRT model); (2) moderate vs. large DIF sizes; (3) percentage of DIF items (10\% vs. 30\%); and (4) mean difference in θ estimates across modes of 0 vs. 1 logits. This resulted in a total of 16 conditions with 10 generated datasets per condition.
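The abstract does not spell out the two flagging criteria. As a minimal sketch, assuming the modified robust Z follows the conventional robust Z statistic applied to the CAT-P\&P difference in posterior mean item difficulties (the symbols d_j and b_j and the 1.96 cutoff are illustrative assumptions, not taken from the article):

\[
d_j = \hat{b}_j^{\mathrm{CAT}} - \hat{b}_j^{\mathrm{P\&P}}, \qquad
RZ_j = \frac{d_j - \operatorname{median}(d)}{0.74 \cdot \operatorname{IQR}(d)},
\]

where 0.74 · IQR(d) approximates the standard deviation of d under normality; item j would be flagged for mode DIF when |RZ_j| > 1.96, while the CrI criterion would flag item j when the central 95\% credible interval of d_j excludes zero.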

RESULTS: Both methods evidenced good to excellent false positive control, irrespective of measurement model, with RZ providing better control of false positives and CrI providing slightly higher power. False positives increased when items were very easy to endorse and when there were mode differences in mean trait level. True positives were predicted by CAT item usage, absolute item difficulty, and item discrimination. Overall, RZ outperformed CrI due to its better control of false positive DIF.

CONCLUSIONS: Whereas false positives were well controlled, particularly for RZ, power to detect DIF was suboptimal. Research is needed to examine the robustness of these methods under varying prior assumptions concerning the distribution of item and person parameters and when data fail to conform to prior assumptions. False identification of DIF when items were very easy to endorse is a problem warranting additional investigation.

}, keywords = {Bayes Theorem, Data Interpretation, Statistical, Humans, Mathematical Computing, Monte Carlo Method, Outcome Assessment (Health Care)}, issn = {1471-2288}, doi = {10.1186/1471-2288-12-124}, author = {Riley, Barth B and Carle, Adam C} } @article {320, title = {Considerations about expected a posteriori estimation in adaptive testing: adaptive a priori, adaptive correction for bias, and adaptive integration interval}, journal = {Journal of Applied Measurement}, volume = {10}, number = {2}, year = {2009}, note = {Raiche, Gilles; Blais, Jean-Guy; United States; Journal of applied measurement; J Appl Meas. 2009;10(2):138-56.}, pages = {138-56}, edition = {2009/07/01}, abstract = {In a computerized adaptive test, we would like to obtain an acceptable precision of the proficiency level estimate using an optimal number of items. Unfortunately, decreasing the number of items is accompanied by a certain degree of bias when the true proficiency level differs significantly from the a priori estimate. The authors suggest that it is possible to reduce the bias, and even the standard error of the estimate, by applying one or a combination of the following strategies to each provisional estimate: adaptive correction for bias proposed by Bock and Mislevy (1982), adaptive a priori estimate, and adaptive integration interval.}, keywords = {*Bias (Epidemiology), *Computers, Data Interpretation, Statistical, Models, Statistical}, isbn = {1529-7713 (Print); 1529-7713 (Linking)}, author = {Raiche, G. and Blais, J. G.} } @article {52, title = {Improving patient reported outcomes using item response theory and computerized adaptive testing}, journal = {Journal of Rheumatology}, volume = {34}, number = {6}, year = {2007}, note = {Chakravarty, Eliza F; Bjorner, Jakob B; Fries, James F; Ar052158/ar/niams; Consensus Development Conference; Research Support, N.I.H., Extramural; Canada; The Journal of rheumatology; J Rheumatol. 2007 Jun;34(6):1426-31.}, month = {Jun}, pages = {1426-31}, edition = {2007/06/07}, abstract = {OBJECTIVE: Patient reported outcomes (PRO) are considered central outcome measures for both clinical trials and observational studies in rheumatology. More sophisticated statistical models, including item response theory (IRT) and computerized adaptive testing (CAT), will enable critical evaluation and reconstruction of currently utilized PRO instruments to improve measurement precision while reducing item burden on the individual patient. METHODS: We developed a domain hierarchy encompassing the latent trait of physical function/disability from the more general to the most specific. Items collected from 165 English-language instruments were evaluated by a structured process including trained raters, modified Delphi expert consensus, and then patient evaluation. Each item in the refined data bank will undergo extensive analysis using IRT to evaluate response functions and measurement precision. CAT will allow for real-time questionnaires of potentially smaller numbers of questions tailored directly to each individual{\textquoteright}s level of physical function. RESULTS: The physical function/disability domain comprises 4 subdomains: upper extremity, trunk, lower extremity, and complex activities. Expert and patient review led to consensus favoring use of present-tense "capability" questions using a 4- or 5-item Likert response construct over past-tense "performance" items. Floor and ceiling effects, attribution of disability, and standardization of response categories were also addressed.
CONCLUSION: By applying statistical techniques of IRT through use of CAT, existing PRO instruments may be improved to reduce questionnaire burden on individual patients while increasing measurement precision, which may ultimately lead to reduced sample size requirements for costly clinical trials.}, keywords = {*Rheumatic Diseases/physiopathology/psychology, Clinical Trials, Data Interpretation, Statistical, Disability Evaluation, Health Surveys, Humans, International Cooperation, Outcome Assessment (Health Care)/*methods, Patient Participation/*methods, Research Design/*trends, Software}, isbn = {0315-162X (Print)}, author = {Chakravarty, E. F. and Bjorner, J. B. and Fries, J.F.} } @article {384, title = {Overview of quantitative measurement methods. Equivalence, invariance, and differential item functioning in health applications}, journal = {Medical Care}, volume = {44}, number = {11 Suppl 3}, year = {2006}, note = {Teresi, Jeanne A; AG15294/AG/NIA NIH HHS/United States; Research Support, N.I.H., Extramural; Research Support, Non-U.S. Gov{\textquoteright}t; Review; United States; Medical care; Med Care. 2006 Nov;44(11 Suppl 3):S39-49.}, month = {Nov}, pages = {S39-49}, edition = {2006/10/25}, abstract = {BACKGROUND: Reviewed in this article are issues relating to the study of invariance and differential item functioning (DIF). The aim of factor analyses and DIF, in the context of invariance testing, is the examination of group differences in item response conditional on an estimate of disability. Discussed are parameters and statistics that are not invariant and cannot be compared validly in cross-cultural studies with varying distributions of disability, in contrast to those that can be compared (if the model assumptions are met) because they are produced by models such as linear and nonlinear regression. OBJECTIVES: The purpose of this overview is to provide an integrated approach to the quantitative methods used in this special issue to examine measurement equivalence. The methods include classical test theory (CTT), factor analytic, and parametric and nonparametric approaches to DIF detection. Also included in the quantitative section is a discussion of item banking and computerized adaptive testing (CAT). METHODS: Factorial invariance and the articles discussing this topic are introduced. A brief overview of the DIF methods presented in the quantitative section of the special issue is provided, together with a discussion of ways in which DIF analyses and examination of invariance using factor models may be complementary. CONCLUSIONS: Although factor analytic and DIF detection methods share features, they provide unique information and can be viewed as complementary in informing about measurement equivalence.}, keywords = {*Cross-Cultural Comparison, Data Interpretation, Statistical, Factor Analysis, Statistical, Guidelines as Topic, Humans, Models, Statistical, Psychometrics/*methods, Statistics as Topic/*methods, Statistics, Nonparametric}, isbn = {0025-7079 (Print); 0025-7079 (Linking)}, author = {Teresi, J. A.} } @booklet {200, title = {Practical methods for dealing with {\textquoteright}not applicable{\textquoteright} item responses in the AMC Linear Disability Score project}, journal = {Health and Quality of Life Outcomes}, volume = {2}, year = {2004}, note = {Holman, Rebecca; Glas, Cees A W; Lindeboom, Robert; Zwinderman, Aeilko H; de Haan, Rob J; England; Health Qual Life Outcomes. 2004 Jun 16;2:29.}, month = {Jun 16}, pages = {29}, type = {Comparative Study; Research Support, Non-U.S. Gov{\textquoteright}t}, edition = {2004/06/18}, abstract = {BACKGROUND: Whenever questionnaires are used to collect data on constructs, such as functional status or health related quality of life, it is unlikely that all respondents will respond to all items. This paper examines ways of dealing with responses in a {\textquoteright}not applicable{\textquoteright} category to items included in the AMC Linear Disability Score (ALDS) project item bank. METHODS: The data examined in this paper come from the responses of 392 respondents to 32 items and form part of the calibration sample for the ALDS item bank. The data are analysed using the one-parameter logistic item response theory model. The four practical strategies for dealing with this type of response are: cold deck imputation; hot deck imputation; treating the missing responses as if these items had never been offered to those individual patients; and using a model which takes account of the {\textquoteright}tendency to respond to items{\textquoteright}. RESULTS: The item and respondent population parameter estimates were very similar for the strategies involving hot deck imputation; treating the missing responses as if these items had never been offered to those individual patients; and using a model which takes account of the {\textquoteright}tendency to respond to items{\textquoteright}. The estimates obtained using the cold deck imputation method were substantially different. CONCLUSIONS: The cold deck imputation method was not considered suitable for use in the ALDS item bank. The other three methods described can be usefully implemented in the ALDS item bank, depending on the purpose of the data analysis to be carried out. These three methods may be useful for other data sets examining similar constructs, when item response theory based methods are used.}, keywords = {*Disability Evaluation, *Health Surveys, *Logistic Models, *Questionnaires, Activities of Daily Living/*classification, Data Interpretation, Statistical, Health Status, Humans, Pilot Projects, Probability, Quality of Life, Severity of Illness Index}, isbn = {1477-7525 (Electronic); 1477-7525 (Linking)}, author = {Holman, R. and Glas, C. A. and Lindeboom, R. and Zwinderman, A. H. and de Haan, R. J.} } @article {191, title = {Item response theory and health outcomes measurement in the 21st century}, journal = {Medical Care}, volume = {38}, number = {9 Suppl II}, year = {2000}, note = {20434967; 0025-7079; Journal Article}, pages = {II28-II42}, abstract = {Item response theory (IRT) has a number of potential advantages over classical test theory in assessing self-reported health outcomes. IRT models yield invariant item and latent trait estimates (within a linear transformation), standard errors conditional on trait level, and trait estimates anchored to item content. IRT also facilitates evaluation of differential item functioning, inclusion of items with different response formats in the same scale, and assessment of person fit, and is ideally suited for implementing computer adaptive testing. Finally, IRT methods can be helpful in developing better health outcome measures and in assessing change over time.
These issues are reviewed, along with a discussion of some of the methodological and practical challenges in applying IRT methods.}, keywords = {*Models, Statistical, Activities of Daily Living, Data Interpretation, Statistical, Health Services Research/*methods, Health Surveys, Human, Mathematical Computing, Outcome Assessment (Health Care)/*methods, Research Design, Support, Non-U.S. Gov{\textquoteright}t, Support, U.S. Gov{\textquoteright}t, P.H.S., United States}, author = {Hays, R. D. and Morales, L. S. and Reise, S. P.} }
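
Several of the entries above turn on expected a posteriori (EAP) scoring: Raiche and Blais discuss bias in EAP proficiency estimates when few items are administered, and Hays, Morales, and Reise note that IRT yields trait estimates with standard errors conditional on trait level, which is what makes computerized adaptive testing workable. The following is a minimal sketch of textbook EAP scoring for dichotomous two-parameter logistic items under a normal prior; it is not drawn from any of the cited articles, and the function name, item parameters, and responses are hypothetical.

import numpy as np

def eap_estimate(responses, a, b, prior_mean=0.0, prior_sd=1.0, n_points=61):
    """Expected a posteriori (EAP) estimate of the latent trait for dichotomous 2PL items.

    responses -- 0/1 scored answers to the administered items
    a, b      -- item discriminations and difficulties on the logit metric
    Returns (EAP estimate, posterior SD); the posterior SD is the standard
    error conditional on the particular items administered.
    """
    responses = np.asarray(responses)
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    # Quadrature grid spanning +/- 4 prior SDs around the prior mean
    theta = np.linspace(prior_mean - 4 * prior_sd, prior_mean + 4 * prior_sd, n_points)
    prior = np.exp(-0.5 * ((theta - prior_mean) / prior_sd) ** 2)  # unnormalized normal prior
    # 2PL probability of endorsing each item at every quadrature point
    p = 1.0 / (1.0 + np.exp(-a[:, None] * (theta[None, :] - b[:, None])))
    likelihood = np.prod(np.where(responses[:, None] == 1, p, 1.0 - p), axis=0)
    posterior = prior * likelihood
    posterior /= posterior.sum()
    eap = np.sum(theta * posterior)
    psd = np.sqrt(np.sum((theta - eap) ** 2 * posterior))
    return eap, psd

# Hypothetical five-item CAT record; parameters and responses are made up for illustration.
a = np.array([1.2, 0.8, 1.5, 1.0, 0.9])
b = np.array([-1.0, -0.3, 0.2, 0.8, 1.5])
print(eap_estimate([1, 1, 1, 0, 0], a, b))

The posterior standard deviation returned alongside the point estimate is the conditional standard error a CAT would monitor when deciding whether enough items have been administered.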